
#####################################################
##### Analysis with dictionary, EU speeches  ########
#####################################################

# OVERVIEW, what we do here:
# I. Load, check and inspect supercorpus, compile descriptive statistics
# II. Apply dictionary for scaling

# clean environment
rm(list = ls())

# Set working directory and put the "Data" folder and all files in "RData_for_directory" here!
setwd("")


# load packages
library(quanteda)
library(stringr)
library(dplyr)
library(XML)
library(RCurl)
library(xml2)
library(ellipse)
library(extrafont)
library(plotrix)
library(ggplot2)
library(plyr)
library(tidyr)
library(stargazer)
library(xtable)
library(tibble)
library(car)
library(data.table)

# NOTE: all speeches in "Data" folder were collected with webscraping techniques in R
# (scripts available upon request)
# for sources of the speeches, please refer to Table 1 in our paper 
# published in May 2019 in Quality and Quantity (https://rdcu.be/bBlQw)


# first, lets check how many R-data files are in our data folder
files <- list.files("1Data/", pattern=".RData")


# load each single .RData file in the folder "1Data" at once and build "supercorpus"
fileDir <- "1Data"
files <- list.files(fileDir,pattern="*.RData", full.names=TRUE)

# make sure to update here total number of cases (one more actually...)
for(i in 1:length(files)){
  load(files[i])
  if(i==1){supercorpus <- speech_corpus}
  if(i>1 & i<41){supercorpus <- supercorpus + speech_corpus}
  if(i>41){
    data$documents[[which(is.na(names(data$documents)))]] <- NULL
    supercorpus <- supercorpus + speech_corpus}
}



#################################################################

# I. check and inspect supercorpus - what kind of data do we have:

#################################################################

# check if all speakers (our cases !) are included in supercorpus
speaker <- docvars(supercorpus, "speaker")
options(max.print=5000)
speaker
speaker_unique <- unique(speaker)
speaker_unique

# see all countries (not our cases !) 
country <- docvars(supercorpus, "country")
country_unique <- unique(country)
country_unique

# see how many speeches were translated
language <- docvars(supercorpus, "language")
language


# show date range of all speeches
date <- docvars(supercorpus, "date")
min(date)
max(date)

# optimize speaker docvars by adding country name for better plotting
# do this only for data overview table
speaker <- sprintf("%s - ", speaker)
speaker
speaker <- paste(speaker, country, collapse = ',')
#speaker <- paste(speaker, collapse = ',')
speaker <- str_split(speaker, ",")
speaker <- unlist(speaker)
# assign new speaker variable to supercorpus
docvars(supercorpus, "speaker") <- speaker

# get summary of supercorpus (first line also shows overall number of speeches)
summary(supercorpus)


########################################

# II. Apply dictionary, plot the results

########################################


# turn into dfm and do only very basic cleaning
# (do further cleaning only for unsupervised methods)
# Stemmed or not, stopwords removed or not, this makes a difference in dictionary results!
# we want ALL scores which is why we do not clean beside tolower and remove_punct

# YET, as an additional filter, we erase collocations which include dictionary terms in misleading contexts (ex.: "all rights reserved", identified by key-word-in-context analysis with Yoshikoder)
# because we cannot directly erase such collocations from the corpus (quanteda is built on different principles, only tokenized items or dfms can be cleaned), 
# we first turn such collocations into so-called 'compound tokens' and erase them from the tokenized supercorpus before turning it into a dfm.

toks <- tokens(supercorpus)
# build the compound tokens, capitalization does not matter here!
toks <- tokens_compound(toks, pattern = phrase(c('rights reserved', 'free trade', 'no choice', 'European family', 'for free', 'Freedom House', 'market liberalization',
                                                 'Gayle Smith', 'approximately equal', 'about equal', 'fairly large', 'Islamic Solidarity', 'addiction heroin', 'heroin users', 'heroin problem',
                                                 'illegal referendum', 'Conference Islam', 'Interfaith Conference', 'illegal annexation', 'illegally annexed', 'Islam Karimov')))
# check again the context
#kw_comp <- kwic(toks, pattern = c('rights_reserved', 'free_trade'))
#head(kw_comp, 50)

# now, remove from toks (plus other single words which conflate contextual meaning!)
toks <- tokens_remove(toks, c('rights_reserved', 'free_trade', 'no_choice', 'European_family', 'for_free', 'Freedom_House', 'market_liberalization',
                              'Gayle_Smith', 'approximately_equal', 'about_equal', 'fairly_large', 'Islamic_Solidarity', 'freestyle', 'Gaymagli', 'Gayibov',
                              'addiction_heroin', 'illegal_referendum', 'freez*', 'freeze', 'sexta', 'heroin_users', 'heroin_problem', 'conference_islam', 'interfaith_conference',
                              'dialectical', 'illegal_annexation', 'illegally_annexed', 'islam_karimov', 'herod'))

# now turn toks (!) into dfm with minimal cleaning
mydfm <- dfm(toks,
             tolower = TRUE,
             remove_punct = TRUE)


# now group the dfm by speaker 
speaker_dfm <- dfm_group(mydfm, groups = "speaker")
ndoc(speaker_dfm)


# load dictionary
#mydictio <- dictionary(file = "audemdic_valid.ykd", format = "yoshikoder", concatenator = "_",
#                       tolower = TRUE, encoding = "auto")
#mydictio

# for now, use function proposed by Benoit (current quanteda version has a bug again)
# see here: https://stackoverflow.com/questions/43819992/quanteda-applying-yoshikoder-dictionary-with-multiple-levels
read_dict_yoshikoder <- function(path, sep=">"){
  doc <- xml2::read_xml(path)
  pats <- xml2::xml_find_all(doc, ".//pnode")
  pnode_names <- xml2::xml_attr(pats, "name")  
  get_pnode_path <- function(pn) {
    pars <- xml2::xml_attr(xml2::xml_parents(pn), "name")
    paste0(rev(na.omit(pars)), collapse = sep)
  }
  pnode_paths <- lapply(pats, get_pnode_path)
  lst <- split(pnode_names, unlist(pnode_paths))
  dictionary(lst)
}

mydictio <- read_dict_yoshikoder("Dic_valid.ykd")

# make categories to character list to check and count terms in each dictionary category
wordcount <- unlist(mydictio)
wordcount

# get number of all terms in dictionary
length(unlist(mydictio))


############################################
# Apply dictionary to supercorpus #
############################################


# apply dictionary on speaker_filter_dfm, dic(tionary)dfm = dicdfm
dicdfm <- dfm(speaker_dfm, dictionary = mydictio)
dicdfm


# implement statistical model for each of the two subcategories
# but first, transform into data frame (easier to extract info)

# adjust number of documents/groups here (nrow)

model <- matrix(dicdfm, nrow = 40, ncol = 4)
model <- as.data.frame(model)
model[,5] <- docvars(dicdfm, "regime")
model[,6] <- row.names(dicdfm)
colnames(model)[6] <- "ID"
# make ID rownames
row.names(model) <- model[,6]
model[,6] <- NULL
model

# add up categories (illiberalism: nationalism, paternalism + traditionalism, and for liberalism: liberal values + woman, minorities, cf. dictionary)
model$illiberal <- model$V1 + model$V2
model$liberal <- model$V3 + model$V4
model

# erase other colums
model[,c(1:4)] <- NULL
model


# apply model to estimate scale positions illiberal vs liberal rhetoric

# apply model, test in one case first:
# q is the estimate of the position on the liberlism-conservatism scale
q <- log((model$illiberal[1]+0.5)/(model$liberal[1]+0.5))
q
# sigi are sigma values for the first dimension (i =ideological orientation)
# sqrt is the function which computes the square root of a numeric vector
sigi <- sqrt((model$illiberal[1]+0.5)^-1 + (model$liberal[1]+0.5)^-1)
sigi

# now for all cases as loop
n <- nrow(model)
for (i in 1:n){
  Illib <- model$illiberal[i]
  Lib <- model$liberal[i]
  q[i] <- log((Lib+0.5)/(Illib+0.5))
  sigi[i] <- sqrt((model$liberal[i]+0.5)^-1 + (model$illiberal[i]+0.5)^-1)
}
q

# add scale for first dimension and sigma values as columns to model
model$scaleideo <- q
model$sigideo <- sigi
model


# now, calculate the confidence intervals on the x-axis 
# based on the interval formula as suggested by Lowe (2011) 
# [q - 1.96xsigma, q + 1.96xsigma]
model$xneg <- c(model$scaleideo-(1.96*model$sigideo))
model$xpos <- c(model$scaleideo+(1.96*model$sigideo))
model

# rename column with regime information
colnames(model)[colnames(model)=="V5"] <- "autodemo"
# export model with scale scores and names of speakers only (but change speaker settings above first, no country added...)
#export <- model[c(1,4)]
#export[,1] <- rownames(export)
#save(export, file = "model_speaker.RData")



# construct boxplot as first overview on dictionary findings:
#Data for boxplot:
set.seed(1234)

names = model$autodemo
value=model$scaleideo
data=model

#Boxplot:
#qplot( x=names , y=value , data=data , geom=c("boxplot","jitter") , fill=names, show.legend = FALSE)+
g <- ggplot(data, aes(x=names, y=value, fill=names), show.legend = FALSE)+
  geom_boxplot() +
  guides(fill=FALSE)+
  scale_fill_manual(values=c("#FF3300", "#00CC00"))+
  xlab("") + ylab("Illiberal vs Liberal Rhetoric")+
  theme_bw(base_size = 15) +
  theme(panel.grid = element_blank())+
  coord_flip()


#function that takes in vector of data and a coefficient,
#returns boolean vector if a certain point is an outlier or not
check_outlier <- function(v, coef=1.5){
  quantiles <- quantile(v,probs=c(0.25,0.75))
  IQR <- quantiles[2]-quantiles[1]
  res <- v < (quantiles[1]-coef*IQR)|v > (quantiles[2]+coef*IQR)
  return(res)
}

#apply this to our data and labe our only outlier (which is Orban as per value)
dat <- data.table(group=data$autodemo,value=data$scaleideo)
dat[,outlier:=check_outlier(value),by=group]
dat[,label:=ifelse(outlier,"Viktor Orb�n - Hungary", "")]

#plot
ggplot(dat,aes(x=group,y=value, fill=names), show.legend = FALSE)+
  geom_boxplot()+
  guides(fill=FALSE)+
  scale_fill_manual(values=c("#FF3300", "#00CC00"))+
  xlab("") + ylab("Illiberal vs Liberal Rhetoric")+
  theme_bw(base_size = 15) +
  theme(panel.grid = element_blank())+
  geom_text(aes(label=label),hjust=1.1, size=3)+
  coord_flip()



# now, construct the scale:
# first, optimize regime types
ord <- c("Autocracy", "Democracy")
model$autodemo <- factor(model$autodemo,levels=ord)

# now make nice plot with ggplot2
theme_set(theme_bw())

# plot with only two regime types:
autodemo_plot <- ggplot(model, aes(reorder(rownames(model), scaleideo), scaleideo, label=scaleideo, shape= autodemo, color = autodemo)) + 
  geom_point() +
  geom_ribbon(aes(ymin = xneg, ymax = xpos, x = reorder(rownames(model), scaleideo)), alpha = 0) +
  scale_shape_manual(name = "Regime Types:", values = c(8, 16)) + 
  scale_colour_manual(name = "Regime Types:", values = c("#FF3300", "#00CC00")) +
  labs(y = "Illiberal vs Liberal Rhetoric", x= "") +
  ylim(-2.7, 1.5) +
  geom_hline(yintercept = 0) +
  theme(legend.position="bottom") +
  theme(text=element_text(family = "serif", color = "black")) +
  coord_flip()

autodemo_plot
# ok

# continue with unsupervised techniques in next script!
